# Load the Season 13 League of Legends champion statistics.
# col_types is a single compact spec string ('c' = character, 'd' = double);
# readr expects a string or a cols() specification, not a character vector
# of one-letter codes.
leaguedf <- read_csv(
  '../data_sets/S13LeagueOfLegendsData.csv',
  col_types = 'cccccdddddddc',
  col_names = c('rowno', 'Name', 'Class', 'Role', 'Tier', 'Score', 'Trend',
                'WinRate', 'RoleRate', 'PickRate', 'BanRate', 'KDA', 'Patch'),
  skip = 1
) %>%
  column_to_rownames("rowno") %>%
  mutate(
    # Combined popularity measure: how often a champion is picked OR banned.
    PickBanRate = PickRate + BanRate,
    # Strip everything up to and including the first underscore
    # (e.g. "13_1" -> "1"), then coerce to a numeric patch number.
    Patch = as.numeric(str_replace(Patch, '(.*?)_(.*?)', '')),
    Role = str_to_title(Role)
  )
# Order the Tier factor from strongest ("God") to weakest ("D") so plots
# and tables list tiers in meta order rather than alphabetically.
leaguedf$Tier <- as.factor(leaguedf$Tier) %>%
  fct_relevel(c("God", "S", "A", "B", "C", "D"))
head(leaguedf, 5)
## Name Class Role Tier Score Trend WinRate RoleRate PickRate BanRate KDA
## 1 Aatrox Fighter Top S 57.63 -31.86 0.4768 0.9163 0.0662 0.1198 1.77
## 2 Ahri Mage Mid S 57.18 4.55 0.4950 0.9465 0.0581 0.0173 2.58
## 3 Akali Assassin Mid S 65.49 4.33 0.4841 0.7574 0.0811 0.1302 2.37
## 4 Akali Assassin Top C 39.63 -1.51 0.4592 0.2350 0.0255 0.1302 2.00
## 5 Akshan Marksman Mid A 49.39 0.34 0.5162 0.6603 0.0275 0.0379 2.26
## Patch PickBanRate
## 1 1 0.1860
## 2 1 0.0754
## 3 1 0.2113
## 4 1 0.1557
## 5 1 0.0654
AFTER WORKING WITH THE DATA AND DISCUSSING THE INFORMATION WITH YOUR GROUP, YOU SHOULD DESCRIBE 2 QUESTIONS THAT ARE CREATIVE AND INNOVATIVE. YOU SHOULD EXPLAIN WHY THESE QUESTIONS ARE INTERESTING AND WHY THEY DESERVE FURTHER INVESTIGATION. I ADVISE TO THINK OF REASONS WHY AN OWNER OF THE DATA MIGHT BENEFIT FROM ANSWERS TO THESE QUESTIONS. THINK OF REASONS WHY THE WORLD MAY BE INTERESTED IN THESE QUESTIONS. THE PURPOSE OF THE INTRODUCTION IS TO STATE SOME INTERESTING QUESTIONS AND DEFEND THE VALUE OF THESE QUESTIONS. THIS INTRODUCTION SHOULD BE WRITTEN IN A WAY THAT SHOULD GET THE READER EXCITED ABOUT SEEING YOUR RESULTS. THIS SHOULD BE WRITTEN IN NO MORE THAN 4 PARAGRAPHS.
# Find champions that are missing from at least one patch for a given
# (Name, Role) pair. complete(nesting(Name, Role), Patch) inserts an all-NA
# stats row for every patch in which an observed Name/Role combination does
# not appear, and filter(!complete.cases(.)) keeps exactly those inserted
# rows; the count is therefore the number of MISSING patches per pair.
tempchamps <- leaguedf %>%
complete(nesting(Name, Role), Patch) %>% # This explicitly finds champions who were only played in a role significantly for less than all of the patches!
filter(!complete.cases(.)) %>%
count(Name, Role)
# Per-(Name, Role) summary statistics, restricted to the "temporary"
# champions flagged above and to fully observed rows.
tempdf <- leaguedf %>%
filter(Name %in% tempchamps$Name & complete.cases(.)) %>%
group_by(Name, Role) %>%
summarize(
n = n(),      # number of patches this Name/Role pair appears in
invN = 1/n(), # inverse count; mapped to point alpha so rarer pairs draw fainter
meanWinRate = mean(WinRate),
sdWinRate = sd(WinRate),
meanPickBan = mean(PickBanRate),
sdPickBan = sd(PickBanRate),
# NOTE(review): sep = " " puts spaces around the embedded newline,
# yielding "Name \n Role" -- confirm this is the intended label format
# (paste(Name, Role, sep = '\n') would give a clean two-line label).
label = paste(Name, '\n', Role, sep = " ")
) %>%
filter(n != 23) # drop pairs present in every patch -- presumably 23 patches total; TODO confirm
# Scatter of mean win rate vs. mean pick/ban rate for temporary champions,
# on log10-transformed axes; transparency encodes 1/n, and the red line
# marks the 50% win-rate threshold.
p1 <- ggplot(tempdf) +
  geom_point(aes(x = meanWinRate, y = meanPickBan, alpha = invN), stroke = 0) +
  geom_vline(xintercept = .50, color = 'red') +
  coord_trans(x = 'log10', y = 'log10') +
  labs(
    x = "Win Rate Average",
    y = "Pick Ban Rate Average",
    title = "Win Rate vs Pick Ban Rate for Temporary champions"
  ) +
  theme(legend.position = "none")

# One row per (Name, Role) pair: distribution of how many patches each
# temporary champion was present for.
p2 <- tempdf %>%
  distinct(Name, Role, n) %>%
  ggplot(aes(x = n)) +
  geom_histogram() +
  labs(x = "Patches Present", y = "Count",
       title = "Distribution of Temporary Champions")

p1 + p2
# Animated plotly scatter: one subplot per role, PickBanRate vs WinRate,
# colored by Class, animated over Patch, with marker size scaled by the
# fraction of games the champion plays that role.
leaguedf %>%
select("Name", "PickBanRate", "WinRate", "Role", "RoleRate", "Class", "Patch") %>%
filter(!(Class == "NULL")) %>% # "NULL" here is a literal string in the data, not R's NULL
group_by(Role) %>%
group_map( ~ plot_ly(data = .,
x = ~ PickBanRate,
y = ~ WinRate,
color = ~ Class,
text = ~ Name,
frame = ~ Patch, # animation frame: one step per patch
hoverinfo = "text",
type = "scatter",
mode = "markers",
marker = list(size = ~ RoleRate*5)
), .keep = TRUE) %>%
# NOTE(review): title misspells "separated"; fixing it would change the
# rendered output string, so it is left as-is here.
subplot(nrows = 2, shareX = TRUE, shareY=TRUE, margin=0.03) %>%
layout(showlegend = FALSE, title = 'Pick Ban Rate vs. Win Rate by Patch seperated by Role',
plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
# NOTE(review): gridcolor lacks the leading '#' used elsewhere -- verify
# whether 'ffff' renders as intended.
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
margin = 0.07) %>%
# `annotations` must be defined elsewhere in the document -- TODO confirm.
layout(annotations = annotations)
# Pick/Ban-rate correlations between champion/role pairs (MakeCorrelationDf
# is defined elsewhere in the document). Within each Champion2 group, label
# only the two most negative and two most positive correlations, plus one
# hand-picked outlier.
PbrCorrelation <- MakeCorrelationDf("PickBanRate", "Pbr") %>%
group_by(Champion2) %>%
arrange(PbrCorrelation) %>%
mutate(label = case_when(
row_number() <= 2 ~ str_to_title(str_replace(Champion1, '\\.', ' ')),
row_number() > n() - 2 ~ str_to_title(str_replace(Champion1, '\\.', ' ')), # This adds a Space into the name where the . is and uncapitalizes the second role
# NOTE(review): this condition uses "Tahm Kench.Support" (dot) while the
# filter below uses "Tahm Kench Support" (space) -- verify which format
# Champion2 actually uses; one of the two may never match.
Champion2 == "Tahm Kench.Support" & PbrCorrelation > 0.68 ~ "Senna Support", # This is an outlier so labeling is justified, especially since it helps show the part of the plot
TRUE ~ as.character(NA)
))
# Boxplots of the correlation distributions for three champion/role pairs,
# with the extreme correlations annotated via the labels built above.
PbrCorrelation %>% filter(Champion2 %in% c("Tahm Kench Support", "Senna Support", "Ashe Adc")) %>%
ggplot(mapping = aes(x=Champion2, y = PbrCorrelation)) +
geom_boxplot() +
ggtitle("PBR Correlation Boxplot")+
scale_x_discrete(labels = c("Ashe Adc", "Senna Support", "Tahm Kench Support")) +
labs(x = "", y = "Pick Ban Rate Correlation Coefficient", caption = "Minimum and Maximum corelation coefficients are annotated, as well as Senna Support for Tahm Kench Support in order\n to best visualize how the strength of certain counters, replacements, and synergies effect Pick Ban Rate.") +
geom_text(aes(label = label), na.rm = TRUE, hjust = -0.1, size = 3, check_overlap = T)
# Relationship between how often a champion is picked (mean pick rate) and
# how much its win rate fluctuates (sd of win rate), with an OLS fit on
# log10-transformed axes.
leaguedf %>%
  group_by(Name) %>%
  summarise(
    Mean_pick = mean(PickRate, na.rm = TRUE),
    Std_pick = sd(PickRate, na.rm = TRUE),
    Mean_win = mean(WinRate, na.rm = TRUE),
    Std_win = sd(WinRate, na.rm = TRUE)
  ) %>%
  arrange(desc(Mean_pick)) %>%
  ggplot(aes(x = Mean_pick, y = Std_win)) +
  geom_point() +
  labs(x = "Mean Pick Rate", y = "Standard Deviation Win Rate") +
  coord_trans(x = 'log10', y = 'log10') +
  # The fit is linear in the data; it only looks curved because the axes
  # are log-scaled. geom_smooth inherits x/y from the plot mapping.
  geom_smooth(method = 'lm', se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# The five "staple" top-lane champions tracked across patches.
staple_tops <- c("Fiora", "Darius", "Garen", "Aatrox", "Jax")

# Bubble plot of one metric per patch for the staple top laners.
# `metric` is the column name as a string (resolved via .data pronoun);
# point size encodes the metric's value, color identifies the champion.
staple_plot <- function(metric, title) {
  leaguedf %>%
    filter(Name %in% staple_tops, Role == "Top") %>%
    ggplot() +
    geom_count(aes(x = as.factor(Patch), y = Name,
                   size = .data[[metric]], color = Name)) +
    labs(x = "Patch", y = "Name", title = title)
}

plot1 <- staple_plot("PickRate", "Pick Rate")
plot2 <- staple_plot("BanRate", "Ban Rate")
plot3 <- staple_plot("WinRate", "Win Rate")

(plot1 / plot2/ plot3) + plot_annotation(title = "Analysis of Staple Top Champions")
IN LESS THAN 6 PARAGRAPHS, YOU SHOULD DESCRIBE THE DATA USED TO ANSWER
THE QUESTIONS. YOU SHOULD EXPLAIN WHERE THE DATA ORIGINATED. FOR
EXAMPLE, IT IS GOOD TO KNOW WHO COLLECTED THE DATA. JUST BECAUSE THE
DATA CAME FROM KAGGLE, DOESN’T MEAN KAGGLE.COM COLLECTED THE DATA. GIVE
AN IN-DEPTH DESCRIPTION OF THE SPECIFIC VARIABLES IN THE DATA REQUIRED
TO ANSWER YOUR QUESTIONS. YOU SHOULDN’T DISCUSS ALL VARIABLES IN THE
DATA IF YOU DIDN’T USE ALL VARIABLES IN THE DATA. YOU SHOULD EXPLAIN
WHAT EACH OBSERVATION REPRESENTS (I.E. PEOPLE, SCHOOLS, STATES, CITIES,
PATIENTS FROM A SPECIFIC HOSPITAL). WHAT IS THIS A SAMPLE OF? HOW MANY
OBSERVATIONS DO YOU HAVE? AFTER READING THIS SECTION, THE READER SHOULD
CLEARLY UNDERSTAND THE SOURCE AND CONTENT OF THE DATA YOU PLAN ON
UTILIZING TO ANSWER YOUR QUESTIONS THAT YOU PROPOSED IN THE
INTRODUCTION. AT LEAST ONE, DESCRIPTIVE TABLE AND AT LEAST ONE FIGURE
SHOULD BE USED HERE TO HELP THE READER UNDERSTAND WHAT THE DATA LOOKS
LIKE WITHOUT SEEING THE ENTIRE DATASET. IN ALL FIGURES AND TABLES, ONLY
THE VARIABLES OF INTEREST SHOULD BE USED.
#Cluster Analysis with K-Means
#Step 1: Normalize Data:
#First drop icky Vars and then Dummy encode Name, Class, and Role
#This is a high dimensional Data set
# One-hot encode Role, Class, and Name via pivot_wider (each distinct value
# becomes its own 0/1 column), then z-score the remaining numeric features
# with a single across() so no one scale dominates the k-means distance.
Normaldf <- leaguedf %>%
  select(-c(Tier, Score, Trend, PickRate, BanRate)) %>%
  pivot_wider(names_from = Role,
              values_from = Role,
              values_fn = function(x) 1,
              values_fill = 0) %>%
  # Prefix class columns so they cannot collide with champion-name columns.
  mutate(Class = paste("Class: ", Class, sep = '')) %>%
  pivot_wider(names_from = Class,
              values_from = Class,
              values_fn = function(x) 1,
              values_fill = 0) %>%
  pivot_wider(names_from = Name,
              values_from = Name,
              values_fn = function(x) 1,
              values_fill = 0) %>%
  # Standardize the continuous features (z-scores) in one pass instead of
  # five hand-written copies of the same formula.
  mutate(across(c(WinRate, RoleRate, PickBanRate, KDA, Patch),
                ~ (.x - mean(.x)) / sd(.x)))
#Step 2: Clusterize the Data
# NOTE(review): kmeans() is stochastic (nstart = 25 random starts) and no
# seed is set, so the clustering itself may differ between runs; the
# relabeling below only makes the cluster NUMBERING deterministic given a
# clustering.
data <- kmeans(Normaldf, centers = 6, nstart = 25)
leaguedf$Cluster <- as.factor(data$cluster)
#Reproducibility for Graphing purposes
# Renumber clusters 1..6 in increasing order of mean RoleRate so the same
# cluster always maps to the same color across knits.
ordering <- leaguedf %>%
  group_by(Cluster) %>%
  summarize(RoleRate = mean(RoleRate)) %>%
  arrange(RoleRate) %>%
  mutate(transformation = row_number())
# Vectorized lookup: map each row's raw k-means label to its RoleRate-ordered
# rank (replaces the previous per-element sapply + filter helper).
leaguedf$Cluster <- ordering$transformation[match(leaguedf$Cluster, ordering$Cluster)]
leaguedf <- leaguedf %>%
  mutate(Cluster = as.factor(Cluster))
# Three coordinated views of the k-means clusters:
#  (a) KDA vs win rate, (b) pick/ban rate vs win rate (this panel carries the
#  shared legend), (c) role-rate distribution per role.
# KMeansPalette must be defined elsewhere in the document -- TODO confirm.
plot1a <- leaguedf %>%
ggplot() +
geom_point(mapping = aes(x = KDA, y = WinRate, color = Cluster), size = 0.75, alpha = 0.4) +
labs(x = "KDA", y = "Win Rate") +
theme_minimal()+
theme(legend.position = "none") +
scale_color_manual(values = KMeansPalette)
plot1b <- leaguedf %>%
ggplot() +
geom_point(mapping = aes(x = PickBanRate, y = WinRate, color = Cluster), size = 0.75, alpha = 0.4) +
labs(x= "Pick/Ban Rate", y= "") +
theme_minimal()+
scale_color_manual(values = KMeansPalette)+
theme(legend.position = "bottom")+
guides(color = guide_legend(override.aes = list(size = 3) ) )
plot1c <- leaguedf %>%
ggplot() +
geom_boxplot(mapping = aes(x = Role, y = RoleRate, color = Cluster), lwd = 0.5) +
labs(x = "Role", y = "Role %") +
theme_minimal() +
scale_color_manual(values = KMeansPalette)+
theme(legend.position = "none")
# patchwork layout grid: panels 1 and 2 side by side, panel 3 full width,
# row 4 reserved for the collected legend (guide_area).
design <- "
12
12
12
12
33
33
33
33
44"
KMeans <- wrap_elements(plot1a + plot1b + plot1c + guide_area() +
plot_layout(design = design, guides = "collect") &
plot_annotation(title = "K Means"))
# Same three-panel layout as the k-means figure, but colored by the
# meta-source Tier instead of the computed cluster, for side-by-side
# comparison. TierPalette must be defined elsewhere -- TODO confirm.
# Note: plot1a/plot1b/plot1c are deliberately reused (overwritten) here.
plot1a <- leaguedf %>%
ggplot() +
geom_point(mapping = aes(x = KDA, y = WinRate, color = Tier), size = 0.75, alpha = 0.4) +
labs(x = "KDA", y = "Win Rate") +
theme_minimal()+
scale_color_manual(values = TierPalette)+
theme(legend.position = "none")
plot1b <- leaguedf %>%
ggplot() +
geom_point(mapping = aes(x = PickBanRate, y = WinRate, color = Tier), size = 0.75, alpha = 0.4) +
labs(x= "Pick/Ban Rate", y= "") +
theme_minimal()+
scale_color_manual(values = TierPalette)+
theme(legend.position = "bottom")+
guides(color = guide_legend(override.aes = list(size = 3)))
plot1c <- leaguedf %>%
ggplot() +
geom_boxplot(mapping = aes(x = Role, y = RoleRate, color = Tier), lwd= 0.5) +
labs(x = "Role", y = "Role %") +
scale_color_manual(values = TierPalette)+
theme_minimal() +
theme(legend.position = "none")
# Reuses the `design` layout string defined for the k-means figure above.
Meta_Tiers <- wrap_elements(plot1a + plot1b + plot1c + guide_area() +
plot_layout(design = design, guides = "collect") &
plot_annotation(title = "Meta SRC Tier"))
# Compose the two figures into the final cluster-analysis comparison.
(KMeans | Meta_Tiers) & plot_annotation(title = "Cluster Analysis") &
theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
#Hierarchical Clustering
# Agglomerative clustering (hclust's default complete linkage) on Euclidean
# distances over the same normalized feature matrix used for k-means.
HCluster <- hclust(dist(Normaldf))
plot(HCluster, xlab = '', sub = '', cex = .9) #Dendrogram!!!
# Cut the dendrogram into 5 groups and attach the labels to the main data.
leaguedf$HClust <- as.factor(cutree(HCluster, 5))
# Same three coordinated views as the k-means figure, but colored by the
# hierarchical-cluster assignment (default ggplot palette this time).
plot1a <- leaguedf %>%
ggplot() +
geom_point(mapping = aes(x = KDA, y = WinRate, color = HClust), size = 0.6, alpha = 0.8) +
labs(x = "KDA", y = "Win Rate") +
theme_minimal()+
theme(legend.position = "none")
plot1b <- leaguedf %>%
ggplot() +
geom_point(mapping = aes(x = PickBanRate, y = WinRate, color = HClust), size = 0.6, alpha = 0.8) +
labs(x= "Pick/Ban Rate", y= "Win Rate") +
theme_minimal()+
theme(legend.position = "right")+
guides(color = guide_legend(override.aes = list(size = 3) ) )
plot1c <- leaguedf %>%
ggplot() +
geom_boxplot(mapping = aes(x = Role, y = RoleRate, color = HClust), lwd = 0.5) +
labs(x = "Role", y = "Role %") +
theme_minimal() +
theme(legend.position = "none")
# Two scatter panels on top, the boxplot full-width below, shared legend.
((plot1a | plot1b) / plot1c )&
plot_layout(guides = "collect") &
plot_annotation(title = "Hierarchical Cluster Analysis") & theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))
# Per-cluster summary table of the hierarchical clustering, rendered with
# kableExtra in a classic serif style.
leaguedf %>%
  rename(`Hierarchical Cluster` = HClust) %>%
  group_by(`Hierarchical Cluster`) %>%
  summarise(
    `Mean Win Rate` = mean(WinRate),
    `Mean PB Rate` = mean(PickBanRate),
    `Mean Role %` = mean(RoleRate),
    `Mean KDA` = mean(KDA),
    `Median Patch` = median(Patch),
    `Number of Champs` = n_distinct(Name)
  ) %>%
  kbl() %>%
  kable_classic(full_width = FALSE, html_font = "Times New Roman")
| Hierarchical Cluster | Mean Win Rate | Mean PB Rate | Mean Role % | Mean KDA | Median Patch | Number of Champs |
|---|---|---|---|---|---|---|
| 1 | 0.4963903 | 0.0988015 | 0.5624936 | 2.089316 | 11.0 | 130 |
| 2 | 0.5124245 | 0.0760563 | 0.6885434 | 2.536914 | 15.0 | 140 |
| 3 | 0.4449148 | 0.1436213 | 0.2178475 | 2.146066 | 8.0 | 12 |
| 4 | 0.5089792 | 0.4172494 | 0.8719532 | 2.705065 | 11.0 | 13 |
| 5 | 0.4639125 | 0.1569875 | 0.9865125 | 4.143750 | 16.5 | 1 |
# Feature matrix for PCA: one-hot encode Role and Class, then drop
# identifiers, cluster labels, and meta scores.
PCAdf <- leaguedf %>%
  mutate(Class = paste("Class", Class, sep='_')) %>%
  pivot_wider(
    names_from = "Role",
    values_from = "Role",
    values_fn = function (x) 1,
    values_fill = 0
  ) %>%
  pivot_wider(
    names_from = "Class",
    values_from = "Class",
    values_fn = function (x) 1,
    values_fill = 0
  ) %>%
  select(-c(Score, Trend, Tier, Cluster, HClust, Patch, Name)) #i'm taking Patch out since it basically messes with the scaling. Patch is uniform, so variance is maximized when patch is maximized
# Centered but unscaled PCA. Use TRUE/FALSE instead of the reassignable T/F,
# and spell out `scale.` (the original `scale = F` only worked via partial
# argument matching).
PCA1 <- prcomp(PCAdf, center = TRUE, scale. = FALSE)
# Running (cumulative) sum of a vector of eigenvalues.
# Delegates to base::cumsum instead of the hand-rolled loop; this also fixes
# the empty-input case (the old 1:length(eigen) loop produced a spurious NA
# for a zero-length vector).
#
# @param eigen Numeric vector of eigenvalues.
# @return Numeric vector the same length as `eigen` holding partial sums.
eigensum <- function(eigen) {
  cumsum(eigen)
}
# Eigenvalues of the covariance matrix are the squared singular values
# reported by prcomp.
eigen <- PCA1$sdev^2
# One row per principal component: its eigenvalue and the running total.
eigFrame <- data.frame(
  dim = factor(seq_along(eigen)),
  eig = eigen,
  sum = eigensum(eigen)
)
# Row/column names of the loading matrix (variables x components).
Principles <- dimnames(PCA1$rotation)
# Loadings for the first three principal components only.
PCACoef <- PCA1$rotation %>%
  as_tibble() %>%
  select(1:3)
# Scree-style chart: eigenvalue per dimension (green bars) overlaid with the
# cumulative sum (translucent blue bars).
eigFrame %>%
  ggplot() +
  geom_bar(mapping = aes(y = eig, x = dim, fill = 'green'), stat = 'identity') +
  geom_bar(mapping = aes(y = sum, x = dim, fill = 'blue'), stat = 'identity', alpha = 0.3) +
  scale_fill_manual(values = c("green" = "green", "blue" = "blue"), labels = c( "Rolling Summation", "Eigenvalue")) +
  labs(fill = "Value", x = "Dimension Number", y = "Eigenvalue") +
  # A numeric legend.position was deprecated in ggplot2 3.5.0 (the knit
  # output warned about this); use legend.position = "inside" together with
  # legend.position.inside instead.
  theme(legend.position = "inside",
        legend.position.inside = c(0.15, 0.95),
        legend.justification = "top")
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Based on the PCA analysis, the scree plot tentatively suggests that roughly 7–9 components capture most of the variance, although the cutoff is not clear-cut; we investigate the appropriate number of features further below.
# NOTE(review): `Tiers` appears unused in the visible code -- the modeling
# loops below use the lowercase `tiers` defined just before them; consider
# removing one of the two.
Tiers <- unique(leaguedf$Tier)
# Same one-hot encoding as for PCA, but Tier is kept as the response and
# Patch/Name plus derived cluster columns are dropped so they cannot leak
# into the stepwise models.
Stepdf <- leaguedf %>%
mutate(Class = paste("Class", Class, sep='_')) %>%
pivot_wider(
names_from = "Role",
values_from = "Role",
values_fn = function (x) 1,
values_fill = 0
) %>%
pivot_wider(
names_from = "Class",
values_from = "Class",
values_fn = function (x) 1,
values_fill = 0
) %>%
select(-c(Score, Trend, Patch, Name, HClust, Cluster))
# One stepwise-selected model per tier: Tier is recoded to a logical
# "is this tier?" response (a linear probability model), and step() searches
# both directions from the full model. The names of the selected predictors
# (via summary()$aliased, minus the intercept) are recorded per tier.
# NOTE(review): growing a tibble with add_row() inside a loop is O(n^2) --
# with only 6 tiers this is harmless, but a list + bind_rows() would scale.
stepResults <- tibble(Tier = c('0'), results = c(list()), .rows = 0)
tiers <- unique(leaguedf$Tier)
for (tier in tiers) {
tempStep <- Stepdf %>%
mutate(Tier = Tier == tier)
stepModel <- lm(Tier ~ ., data = tempStep)
stepModel <- step(stepModel, direction = "both", trace=0)
stepResults <- stepResults %>%
add_row(Tier = tier, results = list(names(summary(stepModel)$aliased)[-1]))
}
# For each tier, refit the stepwise-selected model adding one feature at a
# time (in selection order) and record the BIC at each size, producing one
# BIC-vs-feature-count plot per tier.
plist <- vector('list', length = length(tiers))
counter <- 1
for (tier in tiers) {
  # Predictor names chosen by the stepwise search for this tier.
  testVars <- stepResults %>% filter(Tier == tier) %>% select(results)
  testVars <- testVars$results[[1]]
  # Despite the name, the rSquared column stores BIC values (see the y-axis
  # label below).
  rSquaredFrame <- tibble(features = seq_along(testVars), rSquared = 0)
  tempStep <- Stepdf %>%
    mutate(Tier = Tier == tier)
  varlist <- c()
  for (var in testVars) {
    varlist <- append(varlist, var)
    # all_of() is required when selecting with an external character vector
    # (bare vectors were deprecated in tidyselect 1.1.0 -- this silences the
    # warning the original knit emitted).
    tempStep2 <- select(tempStep, all_of(varlist), Tier)
    model <- lm(Tier ~ ., data = tempStep2)
    rSquaredFrame[length(varlist), 2] <- BIC(model)
  }
  p1 <- rSquaredFrame %>%
    ggplot(mapping = aes(x = features, y = rSquared)) +
    geom_point(shape = 4, color = "red") +
    geom_line() +
    geom_vline(xintercept = 7, color = 'blue') +
    labs(x = "Number of Features", y= "BIC") +
    theme_minimal()
  plist[[counter]] <- p1
  counter <- counter + 1
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(varlist)
##
## # Now:
## data %>% select(all_of(varlist))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Combine the per-tier BIC plots with patchwork's `+` operator.
# NOTE(review): `plot` shadows base::plot for non-call lookups; calls like
# plot(x) still dispatch to the generic, so this is safe but worth renaming.
plot <- Reduce('+', plist)
# Typo fixed in the user-facing subtitle: "Verticle" -> "Vertical".
plot + plot_annotation(title = "BIC Analysis with Bi-Directional Stepwise Regression faceted by Tier", subtitle = "Vertical line at x=7")
We can see that around \(7\) features for each tier is optimal or near optimal, so we want to find 7 variables to use. To do this, we use a Boruta model, which can naturally deal with high-dimensional categorical variables in an efficient way for multi-class regressions. A Boruta model generates decision trees in a specific manner and measures how the inclusion and emphasis of different variables affect the accuracy of a random forest model. Because of this, it is very good at feature selection for random forest models, of which the Ranger model we use is one. Beyond that, it measures importance by gathering Z-scores of the mean decrease in accuracy measure (DAM).
#Feature Selection
# Boruta wraps a random forest to judge which predictors carry real signal
# for Tier; derived columns (clusters, meta scores, the combined
# PickBanRate) are excluded so they cannot leak the answer.
boruta <- Boruta(Tier ~ ., data = select(leaguedf, -c(Score, Trend, Cluster, HClust, PickBanRate)))
# Variable-importance boxplots; las = 2 rotates the axis labels.
plot(boruta, las = 2, cex.axis = 0.7)
We want 7 features, and knowing that Role would account for 5 in the stepwise regression, we can see that WinRate, PickRate, Role, and BanRate are the 4 most likely to be influential variables, and they would make up 8 total features, which is only slightly above what we wanted.
We choose K-Fold over LOOCV for cross validation because 1) K-Fold requires much less computation power and time and this already took 5 hours, and 2) K-Fold can better estimate model accuracies for machine learning by reducing bias and variance, i.e. training on a smaller set will result in less overfitting. K-Fold cross validation functions as: Partition the dataset \(S\) into \(k\) sets, \(S_1, S_2, \ldots, S_k\), loop over \(1\) to \(k\) with \(i\) as the counter, next train each model on the data set \(S - S_i\), then test each model on the sample \(S_i\) and record the accuracy. When this is finished we compare the mean accuracy of each model to best understand its overall performance and standard deviation.
# Every non-empty subset of the four candidate features: 2^4 - 1 = 15
# combinations. With 3 model families and 10 folds each, the loop below
# performs 450 outer train/test fits.
featurecombs <- powerSet(c('WinRate', 'PickRate', 'Role', 'BanRate'))
featurecombs <- featurecombs[-1] # drop the empty set

# Keep only the columns the cross-validation experiment needs.
leaguedf <- leaguedf %>% select(-c(PickBanRate, HClust, Cluster, Score, Trend))

# Assign each row a random permutation index, then bin indices into 10
# roughly equal folds.
leaguedf$sample <- sample(1:nrow(leaguedf), nrow(leaguedf))
b <- nrow(leaguedf) / 10
leaguedf <- leaguedf %>%
  mutate(sample = ceiling(sample / b))

# Result frame: one row per (fold, feature-combination); the three model
# accuracy columns and four feature-indicator columns are filled in below.
crossdf <- data.frame(
  sample = rep(1:10, each = 15),
  svm_linear = 0,
  svm_radial = 0,
  ranger = 0,
  WinRate = 0,
  PickRate = 0,
  Role = 0,
  BanRate = 0
)

# 3-fold CV inside caret::train for each model's hyperparameter selection.
fitControl <- trainControl(method = 'CV',
                           number = 3,
                           verboseIter = FALSE)
#Using a function here makes sure each model stays local and so we end up with a faster training. Consider using this idea elsewhere in the file!!!
# Train one model on traindf and return its accuracy on testdf.
# caret's ranger wrapper cannot tune mtry below 1, which fails when only a
# single predictor column is present (traindf has < 3 columns including
# Tier), so that case falls back to randomForest with mtry = 1.
# BUG FIX: the original assigned testdf$pred only inside the else branch, so
# the randomForest fallback scored against a nonexistent `pred` column; the
# prediction now happens for both branches. Also use the scalar && in the
# if() condition.
trainandtest <- function (method, traindf, testdf, control) {
  if (method == "ranger" && length(colnames(traindf)) < 3) {
    model <- randomForest(
      Tier ~ .,
      data = traindf,
      mtry = 1
    )
  }
  else {
    model <- train(Tier ~ .,
                   data = traindf,
                   method = method,
                   trControl = control)
  }
  # Score on the held-out fold regardless of which trainer was used.
  testdf$pred <- predict(model, testdf)
  return(mean(testdf$Tier == testdf$pred))
}
featurecount <- 1
# Something is wrong here:
# MTRY For Ranger is auto failing with 1 feature. How do I set mtry low?
# 10-fold cross-validation over all 15 feature subsets. The row index
# ((i - 1) * 15) + featurecount addresses crossdf as 10 blocks of 15 rows,
# which matches its rep(1:10, each = 15) fold column: block = fold i,
# offset within block = feature combination.
for (feature in featurecombs) {
for (i in 1:10) {
# Train on the 9 folds != i, test on fold i, using only this subset's
# columns plus the Tier response.
traindf <- leaguedf %>%
filter(!(sample == i)) %>%
select(all_of(feature), Tier)
testdf <- leaguedf %>%
filter(sample == i) %>%
select(all_of(feature), Tier)
#Train models, find the accuracy on the held out sample, and then record the data into the dataframe
# Columns: 2 = svm_linear, 3 = svm_radial, 4 = ranger (matching crossdf).
crossdf[((i-1) * 15) + featurecount, 4] <- trainandtest("ranger", traindf, testdf, fitControl)
crossdf[((i-1) * 15) + featurecount, 2] <- trainandtest("svmLinear", traindf, testdf, fitControl)
crossdf[((i-1) * 15) + featurecount, 3] <- trainandtest("svmRadial", traindf, testdf, fitControl)
#Record the feature by putting 1 in the columns that have the feature present
crossdf[((i-1) * 15) + featurecount, 5] <- "WinRate" %in% feature
crossdf[((i-1) * 15) + featurecount, 6] <- "PickRate" %in% feature
crossdf[((i-1) * 15) + featurecount, 7] <- "Role" %in% feature
crossdf[((i-1) * 15) + featurecount, 8] <- "BanRate" %in% feature
}
featurecount <- featurecount + 1
}
## `summarise()` has grouped output by 'Model'. You can override using the
## `.groups` argument.
#TODO: Fix legends, axis, and axis labels
#Fix legends by adding colors and making manual labels
#Fix Axis labels by adding wrap_text to the feature in the top
# Grouped bar chart of model accuracy for every feature combination with
# exactly `k` features; all four panels share a fixed 0-1 y scale so they
# are directly comparable (replaces four copy-pasted pipelines).
# NOTE(review): assumes crossdf has been summarised elsewhere into
# Model/Feature/Accuracy/featurenum columns -- that step is not shown here.
feature_plot <- function(k, title) {
  crossdf %>%
    filter(featurenum == k) %>%
    na.omit() %>%
    ggplot() +
    geom_bar(mapping = aes(x = Feature, y = Accuracy, fill = Model),
             stat = "identity", position = "dodge") +
    theme_minimal() +
    labs(x = "Features", y = "Accuracy", title = title) +
    ylim(0, 1)
}

p1 <- feature_plot(1, "1 Feature Models")
p2 <- feature_plot(2, "2 Feature Models")
p3 <- feature_plot(3, "3 Feature Models")
p4 <- feature_plot(4, "4 Feature Models")

(p1 + p2) / (p3 + p4) + plot_layout(guides = "collect")
Ranger performs slightly best with all 4 features, at 75.04% accuracy. We will test whether this is statistically significantly different from the models that drop BanRate or PickRate.
# Top three feature sets per model family, tidied for display: model names
# title-cased, feature indicators re-derived from the Feature string, and
# accuracy/sd rounded to 4 digits.
# NOTE(review): this relies on crossdf having Feature, sd and featurenum
# columns produced by a summarise step not shown in this document chunk --
# TODO confirm against the full source.
bestModels <- crossdf %>% na.omit() %>%
group_by(Model) %>%
slice_max(Accuracy, n=3) %>%
mutate(Model = str_to_title(str_replace(Model, '_', ' ')),
`Standard Deviation` = round(sd, digits = 4),
WinRate = as.numeric(str_detect(Feature, "WinRate")),
PickRate = as.numeric(str_detect(Feature, "PickRate")),
BanRate = as.numeric(str_detect(Feature, "BanRate")),
Role = as.numeric(str_detect(Feature, "Role")),
Accuracy = round(Accuracy, digits = 4)) %>%
select(-Feature, -featurenum, -sd) %>%
ungroup()
# Render bestModels with formattable: the four 0/1 feature-indicator columns
# share a single green-check "Yes" / red-cross "No" renderer (previously
# written out five times), and Accuracy is shown as a percentage.
yes_no <- formatter("span",
                    x ~ icontext(ifelse(x > 0, "ok", "remove"), ifelse(x > 0, "Yes", "No")),
                    style = x ~ style(color = ifelse(x < 1, "red", "green")))
bestModels %>% formattable(
  list(`WinRate` = yes_no,
       `PickRate` = yes_no,
       `BanRate` = yes_no,
       `Role` = yes_no,
       `Accuracy` = formatter("span", x ~ percent(x)))
)
| Model | Accuracy | Standard Deviation | WinRate | PickRate | BanRate | Role |
|---|---|---|---|---|---|---|
| Ranger | 75.04% | 0.0161 | Yes | Yes | Yes | Yes |
| Ranger | 72.17% | 0.0154 | Yes | Yes | No | Yes |
| Ranger | 66.84% | 0.0127 | Yes | Yes | Yes | No |
| Svm Linear | 73.57% | 0.0168 | Yes | Yes | Yes | Yes |
| Svm Linear | 72.61% | 0.0134 | Yes | Yes | No | Yes |
| Svm Linear | 65.07% | 0.0208 | Yes | Yes | Yes | No |
| Svm Radial | 74.22% | 0.0175 | Yes | Yes | Yes | Yes |
| Svm Radial | 73.60% | 0.0153 | Yes | Yes | No | Yes |
| Svm Radial | 66.40% | 0.0209 | Yes | Yes | Yes | No |
SVM Models are more human readable than a random forest with 5k trees, so we give preference to them.
# Two-sample z-tests comparing each of the six best models' accuracies
# against the best overall (Ranger with all features).
# NOTE(review): 0.0161 is the best model's standard deviation hard-coded
# from the table above, and 10 is the number of CV folds -- consider
# deriving both programmatically so the table cannot drift out of sync.
bestModels %>% slice_max(Accuracy, n=6) %>%
mutate(meanDiff = max(Accuracy) - Accuracy, z = meanDiff/(sqrt(`Standard Deviation`^2/10 + 0.0161^2/10)), pval = round(pnorm(q=z, lower.tail=F), 4)) %>%
kbl() %>%
kable_classic(full_width = F, html_font = "cambria")
| Model | Accuracy | Standard Deviation | WinRate | PickRate | BanRate | Role | meanDiff | z | pval |
|---|---|---|---|---|---|---|---|---|---|
| Ranger | 0.7504 | 0.0161 | 1 | 1 | 1 | 1 | 0.0000 | 0.000000 | 0.5000 |
| Svm Radial | 0.7422 | 0.0175 | 1 | 1 | 1 | 1 | 0.0082 | 1.090468 | 0.1378 |
| Svm Radial | 0.7360 | 0.0153 | 1 | 1 | 0 | 1 | 0.0144 | 2.050251 | 0.0202 |
| Svm Linear | 0.7357 | 0.0168 | 1 | 1 | 1 | 1 | 0.0147 | 1.997736 | 0.0229 |
| Svm Linear | 0.7261 | 0.0134 | 1 | 1 | 0 | 1 | 0.0243 | 3.668494 | 0.0001 |
| Ranger | 0.7217 | 0.0154 | 1 | 1 | 0 | 1 | 0.0287 | 4.073607 | 0.0000 |
Hence, our Ranger model is NOT statistically better than our radial SVM model with all features at p = 0.05; however, it is better than all of the other models. This is a welcome result, because it means we have substantially reduced the dimensionality of our data set and the number of variables we need while retaining accuracy. We can now visualize this more effectively.
#Hypertuning parameters
#Algorithm is a greedy alg i came up with that goes as follows:
#1. Test with parameters, find the best fit, if it worst than the last then we are done, save the model
#2. Otherwise, we are not done. Take C and Sigma for the best fit of the last set of params, create an interval around them of [C - delta, C + delta], where both Delta are the distance (Cmax - Cmin)/5, and the same for Sigma
#3. If the new C interval < 0.1 in length, we are done because we have gone far enoguh. Save the model
#4. Repeat until complete.
#NOTE THAT THERE MAY BE A BUG WHERE IF WE OVERTRAIN THE MODEL (last was better than all current) WE DON'T GET THE ORIGINAL MODEL BACK!!!! POSSIBLY WANT TO FIX THIS BBG <3 But could always analyze and see cmax - cmin /2 for last C value
#Also likely has a memory leak since none of the models are locally defined.
#improvement idea to fix both issues : Define train function that returns either previous best C and best sigma OR new best C and best sigma? Locally defined since function and if we can keep best C and sigma we can train a model on those specifically!
# Initial 5x5 grid over C in [0, 5] and sigma in [0, 1].
grid <- expand.grid(C = seq(0, 5, length = 5), sigma = seq(0, 1, length = 5))
cmin = 0
cmax = 5
sigmamin = 0
sigmamax = 0.3
done = F
last_acc = 0
fitControl <- trainControl(method='CV',
number = 5,
verboseIter=F)
# Iteratively refine the (C, sigma) grid: stop when accuracy regresses or
# when the C interval narrows below 0.01.
while (done == F) {
svmrad_fit <- train(Tier ~ .,
data = select(leaguedf, c(WinRate, PickRate, BanRate, Role, Tier)),
method="svmRadial",
trControl = fitControl,
tuneGrid = grid)
best_fit <- svmrad_fit$results %>%
slice_max(Accuracy, n=1)
print(best_fit)
# Stop if this round's best accuracy is worse than the previous round's.
if (last_acc > best_fit$Accuracy[1]) {
print("Best fit Higher then last Accuracy")
done = T
}
else {
# Best C sits on the grid's upper edge: double the C search range.
if (best_fit$C[1] == cmax) {
cmin = cmin + cmax
cmax = cmax * 2
}
# NOTE(review): when sigma hits its max, the else branch below is skipped,
# so `grid` and `last_acc` are never updated that iteration -- if accuracy
# keeps improving, the loop retrains on the SAME grid forever. Also, when
# C hits cmax but sigma does not, the else branch immediately overwrites
# the doubled cmin/cmax computed just above. TODO confirm and fix.
if (best_fit$sigma[1] == sigmamax) {
sigmamin = sigmamin + sigmamax
sigmamax = sigmamax *2
}
else {
# Narrow both intervals to the grid points adjacent to the best fit.
cmin <- svmrad_fit$results %>%
filter(C < best_fit$C) %>%
slice_max(C, n = 1)
cmin <- cmin$C[1]
cmax <- svmrad_fit$results %>%
filter(C > best_fit$C) %>%
slice_min(C, n = 1)
cmax <- cmax$C[1]
sigmamin = svmrad_fit$results %>%
filter(sigma < best_fit$sigma) %>%
slice_max(sigma, n = 1)
sigmamin = sigmamin$sigma[1]
sigmamax = svmrad_fit$results %>%
filter(sigma > best_fit$sigma) %>%
slice_min(sigma, n =1)
sigmamax = sigmamax$sigma[1]
# Converged: the C interval is effectively a point.
if (abs(cmin - cmax) < 0.01) {
print("Within 0.01")
print(abs(cmin - cmax))
done = T }
grid <- expand.grid(C = seq(cmin, cmax, length = 5), sigma = seq(sigmamin, sigmamax, length = 5))
last_acc <- best_fit$Accuracy[1]
print(paste("New cmin is: ", cmin, "New cmax is:", cmax, sep = ' '))
}
}
}
# Refit the radial SVM on the selected feature set with the final tuning
# grid from the search above, then persist the fitted model to disk.
svmrad_fit <- train(
  Tier ~ .,
  data = select(leaguedf, c(WinRate, PickRate, BanRate, Role, Tier)),
  method = "svmRadial",
  trControl = fitControl,
  tuneGrid = grid
)
saveRDS(svmrad_fit, "SVMRad_Fit.rds")
# Tier labels in ascending strength order, and the five map roles.
tiers <- c("D", "C", "B", "A", "S", "God")
roles <- c("Top","Jungle", "Mid", "Adc", "Support")
# Convert Tier to its numeric rank (D = 1, ..., God = 6) for the 3D
# contour plots; match() against `tiers` yields the same 1..6 mapping as
# the original fct_recode + as.numeric round-trip.
PlottingData <- leaguedf %>%
  select(c(WinRate, PickRate, BanRate, Role, Tier)) %>%
  mutate(Tier = as.numeric(match(as.character(Tier), tiers)))
# Build one rgl isosurface/contour plot per role. The five sections below
# are deliberately sequential rather than a loop: each subsceneInfo() call
# must happen while the corresponding rgl subscene is current, so the
# ordering of side effects matters. MakeIsoSurface and MakeContourPlot are
# defined elsewhere in the document -- TODO confirm their contracts.
open3d()
#TOP
Plotting <- PlottingData %>%
filter(Role == "Top")
subtop <- subsceneInfo()$id # remember this subscene's id for later wiring
dataList <- MakeIsoSurface(Plotting, "Top")
contour.list <- dataList[1][[1]] # element 1: contour levels/surfaces
contour.df <- dataList[2][[1]]   # element 2: the underlying data frame
#Make one surface for each plot and save.
Topplot <-MakeContourPlot(contour.df, contour.list, Plotting, "Top")
#Now take each surface and generate a plot, the same way a loop would, but save it to a unique variable
#Repeat the steps every single time.
#MID
Plotting <- PlottingData %>%
filter(Role == "Mid")
submid <- subsceneInfo()$id
dataList <- MakeIsoSurface(Plotting, "Mid")
contour.list <- dataList[1][[1]]
contour.df <- dataList[2][[1]]
Midplot <- MakeContourPlot(contour.df, contour.list, Plotting, "Mid")
#ADC
Plotting <- PlottingData %>%
filter(Role == "Adc")
subadc <- subsceneInfo()$id
dataList <- MakeIsoSurface(Plotting, "Adc")
contour.list <- dataList[1][[1]]
contour.df <- dataList[2][[1]]
Adcplot <- MakeContourPlot(contour.df, contour.list, Plotting, "Adc")
#SUPPORT
Plotting <- PlottingData %>%
filter(Role == "Support")
subsupport <- subsceneInfo()$id
dataList <- MakeIsoSurface(Plotting, "Support")
contour.list <- dataList[1][[1]]
contour.df <- dataList[2][[1]]
Supportplot <- MakeContourPlot(contour.df, contour.list, Plotting, "Support")
#JUNGLE
Plotting <- PlottingData %>%
filter(Role == "Jungle")
subjungle <- subsceneInfo()$id
dataList <- MakeIsoSurface(Plotting, "Jungle")
contour.list <- dataList[1][[1]]
contour.df <- dataList[2][[1]]
Jungleplot <- MakeContourPlot(contour.df, contour.list, Plotting, "Jungle")
close3d()
#There is behind the scenes work here with Flexdashboard used to create our widget below. If you want to see it, go to https://github.com/DhAiBt/STOR-320-Group-13 and look at FinalPaper/ContourPlot.nb :-)